In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
# Parse a local HTML file. The file is opened in binary mode so that
# BeautifulSoup receives raw bytes and works out the document encoding itself,
# rather than depending on the platform's default text encoding.
with open('soup.html', 'rb') as f:
    soup = BeautifulSoup(f, 'lxml')

# Display the parsed document.
soup

<html>
<head>
</head><body>
<h1> Understanding Beautiful Soup</h1>
<h2> ML Life Cycle</h2>
<ol id="ordered list">
<li>Exploratory Data Analysis</li>
<li>Feature Engineering</li>
<li>Model Training and Testing</li>
<li>Model Deployment</li>
</ol>
</body>
</html>

In [3]:
# Look a tag up by its name; `find` gives back the first match.
soup.find(name='ol')

<ol id="ordered list">
<li>Exploratory Data Analysis</li>
<li>Feature Engineering</li>
<li>Model Training and Testing</li>
<li>Model Deployment</li>
</ol>

In [4]:
# When several tags share the same name, narrow the search with attributes.
# Here the tag is located by its `id` attribute.
id_filter = {'id': 'ordered list'}
soup.find(attrs=id_filter)

<ol id="ordered list">
<li>Exploratory Data Analysis</li>
<li>Feature Engineering</li>
<li>Model Training and Testing</li>
<li>Model Deployment</li>
</ol>

In [5]:
# Collect every <li> element. recursive=True searches all descendants, so the
# items are found regardless of how deeply they are nested.
li_tags = soup.find_all('li', recursive=True)
# Keep only the text of each list item.
items = [li.get_text() for li in li_tags]
items

['Exploratory Data Analysis',
 'Feature Engineering',
 'Model Training and Testing',
 'Model Deployment']

## Example 1: Extracting tabular data from Wikipedia

In [6]:
# The requests library downloads the HTML source of the URL.
url = 'https://en.wikipedia.org/wiki/List_of_most-followed_Instagram_accounts'
# A timeout keeps the cell from hanging forever if the server never answers.
result = requests.get(url, timeout=30)
# Stop here with a clear HTTP error instead of parsing an error page later.
result.raise_for_status()

# Displaying the response shows its status code (200 means success).
result

<Response [200]>

In [7]:
# Parse the downloaded HTML and take the first <table> on the page.
soup = BeautifulSoup(markup=result.content, features='lxml')
table = soup.find(name='table')
# Every <tr> found under that table.
rows = table.find_all(name='tr')

len(rows)

52

In [8]:
def get_data(row):
    """Return the whitespace-trimmed text of every cell in ``row``.

    ``row`` is any iterable whose elements provide a ``get_text()`` method.
    The result is a list of strings in the same order as the input.
    """
    return [cell.get_text().strip() for cell in row]


# Skip the first and last <tr> (rows[1:-1]) -- presumably a header and a
# footer row; confirm against the page -- and turn the <td>/<th> cells of each
# remaining row into a list of strings.
data = [get_data(tr.find_all(['td', 'th'])) for tr in rows[1:-1]]

In [9]:
# Column labels for the scraped table, in the order the cells appear.
instagram_columns = [
    'rank',
    'username',
    'owner',
    'followers',
    'profession/activity',
    'country/continent',
]
# One DataFrame row per scraped table row; preview the first five.
df = pd.DataFrame(data=data, columns=instagram_columns)
df.head()

Unnamed: 0,rank,username,owner,followers,profession/activity,country/continent
0,1,@instagram,Instagram,437,Social media platform,United States
1,2,@cristiano,Cristiano Ronaldo,361,Footballer,Portugal
2,3,@leomessi,Lionel Messi,278,Footballer,Argentina
3,4,@kyliejenner,Kylie Jenner,278,"Television personality, model, and businesswoman",United States
4,5,@therock,Dwayne Johnson,276,Actor and professional wrestler,United States


In [10]:
# Save the table as CSV; index=False leaves the row index out of the file.
df.to_csv(path_or_buf='instagram.csv', index=False)

## Example 2: Extracting articles from a webpage

In [11]:
url = 'https://coreyms.com/'
# A timeout keeps the cell from hanging forever if the server never answers.
result = requests.get(url, timeout=30)
# Stop here with a clear HTTP error instead of parsing an error page later.
result.raise_for_status()

# Displaying the response shows its status code (200 means success).
result

<Response [200]>

In [12]:
# Parse the downloaded page and gather every <article> element on it.
soup = BeautifulSoup(markup=result.content, features='lxml')
articles = soup.find_all(name='article')

len(articles)

10

In [13]:
def get_details(article):
    """Extract the title, link and description of one article element.

    Parameters
    ----------
    article
        A parsed tag (anything offering a BeautifulSoup-style ``find``)
        representing a single article.

    Returns
    -------
    list
        ``[title, link, description]`` where

        * ``title`` is the text of the first ``<h2>``,
        * ``link`` is the ``href`` of the anchor inside that ``<h2>``,
        * ``description`` is the stripped text of the first ``<div>``
          (assumed to hold the article summary -- confirm against the page).

        A field is ``None`` when the element it comes from is missing, so one
        malformed article does not abort the whole scrape.
    """
    # Look the heading up once; both the title and the link come from it.
    heading = article.find('h2')
    anchor = heading.a if heading is not None else None
    body = article.find('div')

    title = heading.get_text() if heading is not None else None
    link = anchor.get('href') if anchor is not None else None
    description = body.get_text().strip() if body is not None else None

    return [title, link, description]

# One [title, link, description] record per scraped article.
data = [get_details(entry) for entry in articles]

In [14]:
# Column labels matching the [title, link, description] records.
article_columns = ['title', 'link', 'description']
df = pd.DataFrame(data=data, columns=article_columns)
df.head()

Unnamed: 0,title,link,description
0,Python Tutorial: Zip Files – Creating and Extr...,https://coreyms.com/development/python/python-...,"In this video, we will be learning how to crea..."
1,Python Data Science Tutorial: Analyzing the 20...,https://coreyms.com/development/python/python-...,"In this Python Programming video, we will be l..."
2,Python Multiprocessing Tutorial: Run Code in P...,https://coreyms.com/development/python/python-...,"In this Python Programming video, we will be l..."
3,Python Threading Tutorial: Run Code Concurrent...,https://coreyms.com/development/python/python-...,"In this Python Programming video, we will be l..."
4,Update (2019-09-03),https://coreyms.com/general/update-2019-09-03,Hey everyone. I wanted to give you an update o...
