# HTML Parsing. BeautifulSoup

## Scraping and parsing the data of a single page

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
# URL of the category page (product listing) that will be scraped
website = 'https://apteka.de/online-shop/allergie-heuschnupfen'

In [3]:
# Send a GET request for the category page. The timeout (seconds) keeps the
# cell from blocking forever if the server never answers.
page = requests.get(website, timeout=30)

In [4]:
# Inspect the HTTP status code returned by the server for the request above
page.status_code

200

In [5]:
type(page.text)

str

In [6]:
type(page.content)

bytes

In [7]:
# Parse the raw response bytes into a searchable tree using the 'html.parser' backend
soup = BeautifulSoup(page.content, 'html.parser')

In [32]:
# page.text

In [33]:
# soup

In [14]:
# Collect every product card (<div class="product-box-container">) on the page.
# `class_` is the documented keyword for filtering by CSS class; a positional
# ('class', '...') tuple is read as a list of candidate class names and would
# also match elements whose class is literally "class".
result = soup.find_all('div', class_='product-box-container')

In [17]:
len(result)

24

In [16]:
type(result)

bs4.element.ResultSet

## Scraping the links to each product

In [20]:
# Site root; used as the base when relative product links are made absolute
url_1 = 'https://apteka.de'

In [23]:
# Extract the relative link of every product card on the page
url_2 = []
for card in result:
    # `class_` filters by CSS class (a positional tuple would be read as a
    # list of candidate class names)
    for image_box in card.find_all('div', class_='product-box__image'):
        anchor = image_box.find('a')
        # Skip image boxes without a usable link instead of failing on None
        if anchor is not None and anchor.get('href'):
            url_2.append(anchor.get('href'))

In [24]:
len(url_2)

24

In [25]:
import urllib.parse

In [26]:
# Resolve each relative product link against the site root, producing the list
# of absolute URLs for all products shown on the page
url_joined = [urllib.parse.urljoin(url_1, relative) for relative in url_2]

In [27]:
url_joined

['https://apteka.de/online-kaufen/allegra-allergietabletten-20-mg-18113489',
 'https://apteka.de/online-kaufen/allergodil-akut-forte-1-5-mg-ml-nasenspray-loesung-17510656',
 'https://apteka.de/online-kaufen/vividrin-azelastin-edo-bei-heuschnupfen-und-allergien-17538554',
 'https://apteka.de/online-kaufen/desloratadin-adgc-5-mg-filmtabletten-17145955',
 'https://apteka.de/online-kaufen/desloratadin-adgc-5-mg-filmtabletten-17145949',
 'https://apteka.de/online-kaufen/desloratadin-adgc-5-mg-filmtabletten-17145932',
 'https://apteka.de/online-kaufen/abtei-meersalz-nasen-spray-16893833',
 'https://apteka.de/online-kaufen/desloratadin-heumann-5-mg-filmtabletten-16938180',
 'https://apteka.de/online-kaufen/desloratadin-heumann-5-mg-filmtabletten-16938174',
 'https://apteka.de/online-kaufen/desloratadin-heumann-5-mg-filmtabletten-16938145',
 'https://apteka.de/online-kaufen/desloratadin-heumann-5-mg-filmtabletten-16908368',
 'https://apteka.de/online-kaufen/allvent-intens-kapseln-16881391',
 '

## Extracting data of one product

In [28]:
# Absolute URL of the first product in the list; used to develop the per-product parsing
first_link = url_joined[0]

In [29]:
# Fetch the product page. The timeout (seconds) keeps the cell from blocking
# forever if the server never answers.
response = requests.get(first_link, timeout=30)

In [30]:
# Re-bind `soup` to the parsed product page (it previously held the category page)
soup = BeautifulSoup(response.content, 'html.parser')

In [34]:
# soup

In [45]:
# Parse the product title from the page heading. Passing the full attribute
# string to `class_` matches the <h1> whose class attribute is exactly that
# string; a positional ('class', '...') tuple would be read as a list of
# candidate class names instead.
name = soup.find('h1', class_='h2 mt-0 sm:mb-6').text
name

'Allegra Allergietabletten 20 mg 20\xa0St'

In [36]:
import re

In [41]:
form = soup.find('div', string=re.compile("Форма выпуска:"))

In [42]:
form.get_text(strip=True)[15:]

'Tabletten'

In [44]:
# Parse the dosage form. The value is the text that follows the label inside
# the matching <div>; cutting at the label itself (rather than slicing at a
# fixed character offset) stays correct if the label or separator changes.
form = soup.find('div', string=re.compile("Форма выпуска:")).get_text(strip=True)
form = form.partition("Форма выпуска:")[2].strip()
form

'Tabletten'

In [46]:
# Parse the package-content field: find the <div> whose string matches the
# label regex and take its text with surrounding whitespace stripped
content = soup.find('div', string=re.compile('Содержание:')).get_text(strip=True)
content

'Содержание:\n    20\xa0St'

In [47]:
# Keep only the first run of digits found in the text and convert it to int
content = int(re.findall(r'\d+', content)[0])
content

20

In [48]:
# Parse the manufacturer. The value is the text that follows the label inside
# the matching <div>; cutting at the label itself (rather than slicing at a
# fixed character offset) stays correct if the label or separator changes.
manufacturer = soup.find('div', string=re.compile("Производитель:")).get_text(strip=True)
manufacturer = manufacturer.partition("Производитель:")[2].strip()
manufacturer

'A. Nattermann & Cie GmbH'

In [49]:
# Parse the displayed price text from the <span> inside the price paragraph.
# Passing the full attribute string to `class_` matches the <p> whose class
# attribute is exactly that string; a positional ('class', '...') tuple would
# be read as a list of candidate class names instead.
price = soup.find('p', class_='price-details text-grey-darker text-4xl font-sans font-bold mb-0').find('span').text
price

'7,79\xa0€'

In [50]:
type(price)

str

In [51]:
# Turn the decimal comma into a dot so the value can later be parsed by float()
price = price.replace(',', '.')

In [52]:
# Remove every character that is neither a digit nor a dot
# (here: the non-breaking space and the currency sign)
price = re.sub(r'[^\d.]+', '', price)
price

'7.79'

In [53]:
float(price)

7.79

## Joining everything - scraping and parsing of all products from the web-page

In [65]:
# One independent accumulator list per output column; every scraped product
# appends exactly one value to each of them
name, price, form, content, manufacturer = [], [], [], [], []

In [66]:
# Empty dictionary
output = {}

In [67]:
# Exceptions that mean "this field is missing or unparseable on the page":
# AttributeError (find() returned None), ValueError (text is not a number),
# IndexError (no digits found). Anything else is a real error and must surface.
_FIELD_ERRORS = (AttributeError, ValueError, IndexError)

# Full class attribute of the <p> element that holds the product price
_PRICE_CLASS = 'price-details text-grey-darker text-4xl font-sans font-bold mb-0'


def _labelled_value(soup, label):
    """Return the text that follows ``label`` inside the <div> containing it.

    Raises AttributeError when no <div> with that label exists on the page.
    """
    text = soup.find('div', string=re.compile(re.escape(label))).get_text(strip=True)
    # Cut at the label itself instead of a hard-coded character offset
    return text.partition(label)[2].strip()


def _parse_price(text):
    """Convert a displayed price such as '7,79 EUR' or '1.234,56 EUR' to float.

    Raises ValueError when the remaining characters do not form a number.
    """
    digits = re.sub(r'[^\d.,]+', '', text)
    if ',' in digits:
        # Comma is the decimal mark, so any dots are thousands separators
        digits = digits.replace('.', '').replace(',', '.')
    return float(digits)


# Visit every product page and collect one value per column. A field that
# cannot be parsed is stored as '' so that all lists keep the same length.
for product_url in url_joined:
    # The timeout keeps a stalled connection from blocking the whole run
    response = requests.get(product_url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Product title
    try:
        name.append(soup.find('h1', class_='h2 mt-0 sm:mb-6').text)
    except _FIELD_ERRORS:
        name.append('')

    # Product price as float
    try:
        price.append(_parse_price(soup.find('p', class_=_PRICE_CLASS).find('span').text))
    except _FIELD_ERRORS:
        price.append('')

    # Dosage form
    try:
        form.append(_labelled_value(soup, "Форма выпуска:"))
    except _FIELD_ERRORS:
        form.append('')

    # Package content: first integer found in the field text
    try:
        content.append(int(re.findall(r'\d+', _labelled_value(soup, 'Содержание:'))[0]))
    except _FIELD_ERRORS:
        content.append('')

    # Manufacturer
    try:
        manufacturer.append(_labelled_value(soup, "Производитель:"))
    except _FIELD_ERRORS:
        manufacturer.append('')

# Build the column dictionary once; it references the lists filled above
output = {'Name': name, 'Price': price, 'Form': form, 'Content': content, 'Manufacturer': manufacturer}

In [68]:
output

{'Name': ['Allegra Allergietabletten 20 mg 20\xa0St',
  'Allergodil akut forte 1,5 mg/ml Nasenspray Lösung 10\xa0ml',
  'Vividrin Azelastin EDO, bei Heuschnupfen und Allergien 20X0.6\xa0ml',
  'Desloratadin-Adgc 5 mg Filmtabletten 100\xa0St',
  'Desloratadin-Adgc 5 mg Filmtabletten 50\xa0St',
  'Desloratadin-Adgc 5 mg Filmtabletten 20\xa0St',
  'Abtei Meersalz Nasen Spray 20\xa0ml',
  'Desloratadin Heumann 5 mg Filmtabletten 100\xa0St',
  'Desloratadin Heumann 5 mg Filmtabletten 50\xa0St',
  'Desloratadin Heumann 5 mg Filmtabletten 20\xa0St',
  'Desloratadin Heumann 5 mg Filmtabletten 10\xa0St',
  'Allvent intens Kapseln 60\xa0St',
  'Levocetirizin Micro Labs 5 mg Filmtabletten 100\xa0St',
  'Levocetirizin Micro Labs 5 mg Filmtabletten 50\xa0St',
  'Levocetirizin Micro Labs 5 mg Filmtabletten 20\xa0St',
  'Mometason beta Heuschnupfenspray 18\xa0g',
  'Mometason beta Heuschnupfenspray 10\xa0g',
  'Desloratadin Glenmark 5 mg Tabletten 7\xa0St',
  'Desloratadin Stada 5 mg Filmtabletten 10

In [69]:
df = pd.DataFrame(output)

In [70]:
df

Unnamed: 0,Name,Price,Form,Content,Manufacturer
0,Allegra Allergietabletten 20 mg 20 St,7.79,Tabletten,20,A. Nattermann & Cie GmbH
1,"Allergodil akut forte 1,5 mg/ml Nasenspray Lös...",9.69,Nasenspray,10,Viatris Healthcare GmbH
2,"Vividrin Azelastin EDO, bei Heuschnupfen und A...",6.99,Augentropfen,20,Dr. Gerhard Mann Chem.-pharm.Fabrik GmbH
3,Desloratadin-Adgc 5 mg Filmtabletten 100 St,15.56,Filmtabletten,100,Zentiva Pharma GmbH
4,Desloratadin-Adgc 5 mg Filmtabletten 50 St,8.32,Filmtabletten,50,Zentiva Pharma GmbH
5,Desloratadin-Adgc 5 mg Filmtabletten 20 St,3.95,Filmtabletten,20,Zentiva Pharma GmbH
6,Abtei Meersalz Nasen Spray 20 ml,4.3,Nasenspray,20,Perrigo Deutschland GmbH
7,Desloratadin Heumann 5 mg Filmtabletten 100 St,39.45,Filmtabletten,100,HEUMANN PHARMA GmbH & Co. Generica KG
8,Desloratadin Heumann 5 mg Filmtabletten 50 St,22.06,Filmtabletten,50,HEUMANN PHARMA GmbH & Co. Generica KG
9,Desloratadin Heumann 5 mg Filmtabletten 20 St,9.97,Filmtabletten,20,HEUMANN PHARMA GmbH & Co. Generica KG


## Pagination - scraping from several pages

In [71]:
# Accumulator lists, one per output column
name = []
price = []
form = []
content = []
manufacturer = []
output = {}

# Site root (base for relative product links) and the first listing page
url_1 = 'https://apteka.de'
url = 'https://apteka.de/online-shop/allergie-heuschnupfen'

# Exceptions that mean "this field is missing or unparseable on the page":
# AttributeError (find() returned None), ValueError (text is not a number),
# IndexError (no digits found). Anything else is a real error and must surface.
_FIELD_ERRORS = (AttributeError, ValueError, IndexError)

# Full class attribute of the <p> element that holds the product price
_PRICE_CLASS = 'price-details text-grey-darker text-4xl font-sans font-bold mb-0'


def _labelled_value(soup, label):
    """Return the text that follows ``label`` inside the <div> containing it.

    Raises AttributeError when no <div> with that label exists on the page.
    """
    text = soup.find('div', string=re.compile(re.escape(label))).get_text(strip=True)
    # Cut at the label itself instead of a hard-coded character offset
    return text.partition(label)[2].strip()


def _parse_price(text):
    """Convert a displayed price such as '7,79 EUR' or '1.234,56 EUR' to float.

    Raises ValueError when the remaining characters do not form a number.
    """
    digits = re.sub(r'[^\d.,]+', '', text)
    if ',' in digits:
        # Comma is the decimal mark, so any dots are thousands separators
        digits = digits.replace('.', '').replace(',', '.')
    return float(digits)


# Walk the listing page by page until no "next" link is left
while True:
    # The timeout keeps a stalled connection from blocking the whole run
    page = requests.get(url, timeout=30)
    soup = BeautifulSoup(page.content, 'html.parser')
    next_page_link = soup.find('a', class_='arrow next')
    result = soup.find_all('div', class_='product-box-container')

    # Relative links of all product cards on this listing page
    url_2 = []
    for card in result:
        for image_box in card.find_all('div', class_='product-box__image'):
            anchor = image_box.find('a')
            # Skip image boxes without a usable link instead of failing on None
            if anchor is not None and anchor.get('href'):
                url_2.append(anchor.get('href'))

    # Absolute product URLs
    url_joined = [urllib.parse.urljoin(url_1, relative) for relative in url_2]

    # Visit every product page; a field that cannot be parsed is stored as ''
    # so that all lists keep the same length
    for product_url in url_joined:
        response = requests.get(product_url, timeout=30)
        product_soup = BeautifulSoup(response.content, 'html.parser')

        # name
        try:
            name.append(product_soup.find('h1', class_='h2 mt-0 sm:mb-6').text)
        except _FIELD_ERRORS:
            name.append('')

        # price
        try:
            price.append(_parse_price(product_soup.find('p', class_=_PRICE_CLASS).find('span').text))
        except _FIELD_ERRORS:
            price.append('')

        # form
        try:
            form.append(_labelled_value(product_soup, "Форма выпуска:"))
        except _FIELD_ERRORS:
            form.append('')

        # content: first integer found in the field text
        try:
            content.append(int(re.findall(r'\d+', _labelled_value(product_soup, 'Содержание:'))[0]))
        except _FIELD_ERRORS:
            content.append('')

        # manufacturer
        try:
            manufacturer.append(_labelled_value(product_soup, "Производитель:"))
        except _FIELD_ERRORS:
            manufacturer.append('')

    # Stop when the page has no "next" link (or the link carries no target)
    next_href = next_page_link.get('href') if next_page_link is not None else None
    if not next_href:
        break

    # urljoin handles absolute, root-relative and query-only hrefs alike
    url = urllib.parse.urljoin(url, next_href)

# Build the column dictionary once; it references the lists filled above
output = {'Name': name, 'Price': price, 'Form': form, 'Content': content, 'Manufacturer': manufacturer}

In [73]:
df = pd.DataFrame(output)
df

Unnamed: 0,Name,Price,Form,Content,Manufacturer
0,Allegra Allergietabletten 20 mg 20 St,7.79,Tabletten,20,A. Nattermann & Cie GmbH
1,"Allergodil akut forte 1,5 mg/ml Nasenspray Lös...",9.69,Nasenspray,10,Viatris Healthcare GmbH
2,"Vividrin Azelastin EDO, bei Heuschnupfen und A...",6.99,Augentropfen,20,Dr. Gerhard Mann Chem.-pharm.Fabrik GmbH
3,Desloratadin-Adgc 5 mg Filmtabletten 100 St,15.56,Filmtabletten,100,Zentiva Pharma GmbH
4,Desloratadin-Adgc 5 mg Filmtabletten 50 St,8.32,Filmtabletten,50,Zentiva Pharma GmbH
...,...,...,...,...,...
139,Livocab direkt Augentropfen bei Allergie 4 ml,8.99,Augentropfen,4,Johnson & Johnson GmbH (OTC)
140,Livocab direkt Nasenspray bei Allergie 5 ml,6.49,Nasenspray,5,Johnson & Johnson GmbH (OTC)
141,Allergodil Augentropfen 6 ml,12.97,Augentropfen,6,Viatris Healthcare GmbH
142,Cetirizin AL 1 mg/ml Sirup 75 ml,6.40,Lösung zum Einnehmen,75,ALIUD Pharma GmbH


In [74]:
# Save the scraped table to 'pharmacy.csv' (relative path, so it lands in the
# current working directory); with pandas defaults the index is written too
df.to_csv('pharmacy.csv')