# Introduction to web scraping using python

### Part 1 - a very rapid introduction to python

#### Basics - variables, indexing, operations

In [17]:
# Variable assignment
x = 2
y = 'hello, world!'
print(x)
print(y)

2
hello, world!


##### Numerics

In [18]:
a = 12.3
b = 2
c = 0.01

print(type(a)) # float
print(type(b)) # int

a + c * b**2 # Some random operation

<class 'float'>
<class 'int'>


12.34

##### Strings

In [19]:
# Data types - strings
my_string = "Hello friends!"
type(my_string)


str

In [20]:
# String operations
firstname = 'Lionel'
lastname = 'Richie'
fullname =  firstname + ' ' + lastname

In [21]:
# Indexing
print(fullname[0])
print(fullname[0:6])
print(fullname[-6:])

L
Lionel
Richie


##### Booleans

In [22]:
z = True
a = False

In [23]:
a == 0 # Booleans will return equality to 1 if True, 0 if False

True

In [24]:
10 > 2

True

#### Data structures

##### Lists

In [25]:
numeric_list = [12, 13, 14, 18, 1, 3, 3]
string_list = ['programming', 'is', 'fun']
mixed_list = [1, '3', None, 2.4]

In [26]:
# List indexation - same as with strings
print(numeric_list[3:8])
print(string_list[0]) 

[18, 1, 3, 3]
programming


In [27]:
# Important! Selecting one element from a list returns the element, selecting multiple elements returns a list
print(type(mixed_list[1]))
print(type(mixed_list[1:3]))

<class 'str'>
<class 'list'>


In [28]:
# Operation on elements obtained through indexing
numeric_list[0] + numeric_list[3] 

30

In [29]:
# This will return an error, since we are extracting a length-1 list instead of the constituent element
numeric_list[0] + numeric_list[3:4]

TypeError: unsupported operand type(s) for +: 'int' and 'list'

In [30]:
# Adding to list
numeric_list + ['a', 'b', 'c']

[12, 13, 14, 18, 1, 3, 3, 'a', 'b', 'c']

In [31]:
# Append adds one element
numeric_list.append(['a', 'b', 'c'])

In [32]:
# Extend adds multiple elements
numeric_list.extend(['x', 'y', 'z'])

##### Dictionaries

In [33]:
contact_details = {'Thom Yorke': 'thom_yorke@gmail.com', 'Johnny Greenwood': 'j_g_w@mail.ru'}
contact_details['Thom Yorke']

'thom_yorke@gmail.com'

In [34]:
# Adding elements
contact_details['Ed Obrien'] = 'ed_obrien@compasslexecon.com'
contact_details

{'Ed Obrien': 'ed_obrien@compasslexecon.com',
 'Johnny Greenwood': 'j_g_w@mail.ru',
 'Thom Yorke': 'thom_yorke@gmail.com'}

#### Control flows

In [35]:
# For loops
integers = list(range(0, 20, 3))
for i in integers:
    print(i, ':', i**2)

0 : 0
3 : 9
6 : 36
9 : 81
12 : 144
15 : 225
18 : 324


In [36]:
# While loops
j = 1
while j < 10:
    print(j/2)
    j += 1

0.5
1.0
1.5
2.0
2.5
3.0
3.5
4.0
4.5


In [37]:
# If-else
integers_2 = list(range(1, 30, 2))
threshold = 5
if len(integers_2) > threshold:
    print('Congrats, your list is longer than {}'.format(threshold))
else:
    print('Too bad, your list is too short')
        

Congrats, your list is longer than 5


### Part 2 - web scraping

In [38]:
url = 'https://www.truckscout24.com/containers/used'

#### Connecting to the webpage

In [39]:
import requests

In [40]:
# Connect to the server, extract the data
r = requests.get(url)
r

<Response [200]>

In [41]:
# Save the content to a variable
data = r.text

In [42]:
# Write the content to a text file (so we don't have to keep making requests)
file = open('containers.txt', 'w', encoding='utf8')
file.write(str(data))
file.close()

#### Making soup

In [43]:
# Import BeautifulSoup
from bs4 import BeautifulSoup

In [47]:
# Parse the HTML, create a BeautifulSoup object
soup = BeautifulSoup(data, 'html.parser')
soup.prettify()

AttributeError: 'str' object has no attribute 'head'

#### "Drilling down" to the data we want

In [65]:
# Drill-down: find the smallest tag which contains the data we are interested in
product_boxes_frame = soup.find('div', attrs={'class': 'articleList'})

In [84]:
# This returns a BeautifulSoup "Tag" object - NB this isn't the "actual" text/content, it is an object
# which lets us do further work on
type(product_boxes_frame)

<class 'bs4.element.Tag'>


[' ',
 ' Container of list items - START ',
 '\n',
 <a href="https://www.truckscout24.com/vehicle-details/Containers-A1-Container-Absetzmulde-7m-Mulde-Rot-Skip-container/18249291/1" onclick="" style="text-decoration: none;"> <!-- one item of list -->
 <div class="listItem topBorder clearfix">
 <div class="floatLeft">
 <div class="listItemSpacer"></div>
 </div>
 <div class="floatLeft listImage">
 <div id="container">
 <div class="image-wrapper">
 <div class="image-container">
 <img class="" src="https://pic.truckscout24.net/images-small/91/92/0018249291001.jpg"/>
 </div>
 </div>
 </div>
 <div class="marginTopFontS paddingBottomM">
 <span class="fontLegal floatLeft">1 Image</span>
 </div>
 </div>
 <div class="floatLeft listData noUnderline">
 <div class="listItemHeader">
 <div class="gridSpan7 gridInner floatRight flexibleGridHeaderHeight">
 <div class="fontDefault cursorPointer">
 <div class="listItemHeaderPrice floatLeft">
 <span>€</span>
 <span class="fontHeadline">1.100,-</span>
 <sp

In [66]:
# Find all product boxes - put into list
product_boxes_list = product_boxes_frame.find_all('div', attrs={'class': 'listItem',
                                'class': 'topBorder',
                                'class': 'clearfix'})

#### Extracting the data - Title

In [87]:
# Inspect first element
print(product_boxes_list[0].prettify())

<div class="listItem topBorder clearfix">
 <div class="floatLeft">
  <div class="listItemSpacer">
  </div>
 </div>
 <div class="floatLeft listImage">
  <div id="container">
   <div class="image-wrapper">
    <div class="image-container">
     <img class="" src="https://pic.truckscout24.net/images-small/91/92/0018249291001.jpg"/>
    </div>
   </div>
  </div>
  <div class="marginTopFontS paddingBottomM">
   <span class="fontLegal floatLeft">
    1 Image
   </span>
  </div>
 </div>
 <div class="floatLeft listData noUnderline">
  <div class="listItemHeader">
   <div class="gridSpan7 gridInner floatRight flexibleGridHeaderHeight">
    <div class="fontDefault cursorPointer">
     <div class="listItemHeaderPrice floatLeft">
      <span>
       €
      </span>
      <span class="fontHeadline">
       1.100,-
      </span>
      <span class="marginLeftS fontDefault cursorPointer">
       (€ 1.309,- Gross price)
      </span>
     </div>
     <div class="listItemHeaderMileage paddingHorizontalM

In [86]:
# This is what we want!
print('Woohoo!')

Woohoo!


In [73]:
# Look at first product box - find title
title = product_boxes_list[0].find('span', attrs={'class': 'fontLoud'})

In [88]:
# Use the ".text" attribute to extract the text relating to the found tag
title.text

'A1 Container - Absetzmulde 7mÂ³ Mulde Rot - Skip...'

In [98]:
# Each tag has different attributes
print("Name:", title.name)
print("Contents:", title.contents)
print("Attrs:", title.attrs)

Name: span
Contents: ['A1 Container - Absetzmulde 7mÂ³ Mulde Rot - Skip...']
Attrs: {'class': ['fontLoud']}


#### Extracting the data - Price

In [109]:
# Find box with prices
price_box = (product_boxes_list[0].
             find('div', attrs={'class': 'listItemHeaderPrice'}).
             find('span', attrs={'class': 'fontHeadline'}))

print(price_box)

<span class="fontHeadline">1.100,-</span>


In [111]:
# A bit of cleaning
price = price_box.text.replace(',-', '').replace('.', '')
price

'1100'

In [113]:
# For each product box, extract information
for prod in product_boxes_list:

    # Title
    title = prod.find('span', attrs={'class': 'fontLoud'}).text
    
    # Price
    price = prod.find('div', attrs={'class': 'listItemHeaderPrice'}).find('span', attrs={'class': 'fontHeadline'}).text
    price = price.replace(',-', '').replace('.', '')
    print(title, price, sep=": ")

A1 Container - Absetzmulde 7mÂ³ Mulde Rot - Skip...: 1100
Sonstige/Other - Onbekend flat, vlakke bak, open...: 1650
Sonstige/Other - Zandt Cargo BDF...: 900
A1 Container - Absetzmulde mit Klappe 7mÂ³ Mulde...: 1250
A1 Container - Unterrahmen Twist-Lock fÃ¼r 20 FuÃ...: 2450
Krone - WP 7.7 N2S-CS Jumbo Edscha Schiebegardine...: 3500
Krone - WP 7.7 N2S-CS Jumbo Edscha Schiebegardine...: 3500
Krone - Schiebeplane mit Edscha Schiebeverdeck...: 5975
Krone - Koffer Glattwandkassette - Swap box: 6850
Sonstige/Other - 20 FuÃ DV Lagercontainer...: 1380
Krone - Koffer Glattwandkassette - Swap box: 6250
Krone - Koffer Glattwandkassette - Swap box: 6250
Krone - Koffer Glattwandkassette - Swap box: 5750
Krone - Koffer Glattwandkassette - Swap box: 8250
Krone - Koffer Glattwandkassette - Swap box: 6950
Krone - Schiebeplane mit Edscha Schiebeverdeck...: 6850
Krone - Koffer Glattwandkassette - Swap box: 6750
Sonstige/Other - Andere Schiebeplane mit Edscha...: 5950
Krone - Koffer Glattwandkassette - S