# Introduction to web scraping using Python

### Part 1 - a very rapid introduction to Python

#### Basics - variables, indexing, operations

In [None]:
# Variable assignment: bind an integer and a string to names, then echo each on its own line
x, y = 2, 'sup, bro'
print(x, y, sep='\n')

##### Numerics

In [7]:
# Three numeric literals: two floats and one int
a, b, c = 12.3, 2, 0.01

# type() reports the class of each value
print(type(a))  # float
print(type(b))  # int

# Operator precedence: ** is applied first, then *, then +
a + c * b ** 2

<class 'float'>
<class 'int'>


12.34

##### Strings

In [None]:
# Data types - strings: quoted text is stored as `str`; the last line displays its type
my_string: str = "Hello friends!"
type(my_string)


In [None]:
# String operations: build a full name by gluing the two parts together with a space
firstname, lastname = 'Lionel', 'Richie'
fullname = ' '.join((firstname, lastname))

In [None]:
# Indexing: the first character, the first six characters, and the last six characters
print(fullname[0], fullname[:6], fullname[-6:], sep='\n')

##### Booleans

In [18]:
# The two boolean literals
z, a = True, False

In [19]:
# bool is a subclass of int, so True == 1 and False == 0; with a = False this comparison is True
a == 0

True

In [23]:
# Comparison operators evaluate to a boolean
10 > 2

True

#### Data structures

##### Lists

In [2]:
# A list is an ordered collection written in square brackets
numeric_list = [12, 13, 14, 18, 1, 3, 3]
string_list = ['programming', 'is', 'fun']
mixed_list = [1, '3', None, 2.4]  # elements of one list may have different types

In [3]:
# List indexing - same as with strings.
# The slice end (8) lies past the end of the 7-element list; slicing clamps it instead of raising.
print(numeric_list[3:8], string_list[0], sep='\n')

[18, 1, 3, 3]
programming


In [None]:
# Important! A single index returns the element itself, whereas a slice
# (even a short one) always returns a list
print(type(mixed_list[1]), type(mixed_list[1:3]), sep='\n')

In [None]:
# Operation on elements obtained through indexing: a single index yields the element itself,
# so the two integers can be added directly
numeric_list[0] + numeric_list[3]

In [None]:
# Adding an element to a slice fails: numeric_list[3:4] is a length-1 list, not the
# integer inside it, and int + list is not defined.
# The TypeError is caught and printed so the demonstration stays visible without
# halting a Restart & Run All of the notebook.
try:
    numeric_list[0] + numeric_list[3:4]
except TypeError as error:
    print('TypeError:', error)

##### Dictionaries

In [27]:
# A dictionary maps keys (here: names) to values (here: e-mail addresses)
contact_details = {
    'Thom Yorke': 'thom_yorke@gmail.com',
    'Johnny Greenwood': 'j_g_w@mail.ru',
}
# Look a value up by its key
contact_details['Thom Yorke']

'thom_yorke@gmail.com'

In [30]:
# Adding elements: assigning to a key that is not yet in the dictionary inserts a new key/value pair
contact_details['Ed Obrien'] = 'ed_obrien@compasslexecon.com'
contact_details  # trailing expression displays the updated dictionary

{'Ed Obrien': 'ed_obrien@compasslexecon.com',
 'Johnny Greenwood': 'j_g_w@mail.ru',
 'Thom Yorke': 'thom_yorke@gmail.com'}

#### Control flow

In [38]:
# For loops: visit every third integer below 20 and show it next to its square
integers = [*range(0, 20, 3)]
for i in integers:
    print(f'{i} : {i ** 2}')

0 : 0
3 : 9
6 : 36
9 : 81
12 : 144
15 : 225
18 : 324


In [47]:
# While loops: the body repeats for as long as the condition holds
j = 1
while j < 10:
    print(j / 2)  # true division always yields a float
    j = j + 1

0.5
1.0
1.5
2.0
2.5
3.0
3.5
4.0
4.5


In [55]:
# If-else: pick one of two messages by comparing the list's length with a threshold
threshold = 5
integers_2 = [*range(1, 30, 2)]
if len(integers_2) <= threshold:
    print('Too bad, your list is too short')
else:
    print('Congrats, your list is longer than {}'.format(threshold))
        

Congrats, your list is longer than 5


### Part 2 - web scraping

In [86]:
# Page to scrape: the used-containers listing on TruckScout24
url = 'https://www.truckscout24.com/containers/used'

#### Connecting to the webpage

In [58]:
import requests

In [87]:
# Connect to the server, extract the data.
# A timeout is set because requests waits indefinitely by default if the server never responds.
r = requests.get(url, timeout=30)
r  # displays the response status; <Response [200]> means the request succeeded

<Response [200]>

In [88]:
# Save the content to a variable (r.text is the response body decoded to a str)
data = r.text

In [93]:
# Write the content to a text file (so we don't have to keep making requests).
# The with-block closes the file even if the write raises; `data` is already a str
# (it comes from r.text), so no str() conversion is needed.
with open('containers.txt', 'w', encoding='utf8') as file:
    file.write(data)

#### Making soup

In [94]:
# Import BeautifulSoup
from bs4 import BeautifulSoup

In [95]:
# Parse the HTML, create a BeautifulSoup object
soup = BeautifulSoup(data, 'html.parser')
# Preview only the start of the indented markup: printing renders real line breaks, whereas
# leaving prettify() as the cell's last expression dumps the whole page as one escaped string.
print(soup.prettify()[:1000])

In [101]:
# Drill-down: find the smallest tag which contains the data we are interested in.
# Note: find() returns the first matching Tag (or None if nothing matches), not a list;
# len() on a Tag counts its direct children, which may include whitespace-only text nodes.
listings = soup.find('div', attrs={'class': 'articleList'})
len(listings)

87

In [99]:
# Display the matched container's HTML to inspect how a single listing is structured
listings

<div class="articleList"> <!-- Container of list items - START -->
<a href="https://www.truckscout24.com/vehicle-details/Containers-Krone-WP-7-7-N2S-CS-Jumbo-Edscha-Schiebegardine-Hubdach-Swap-body-tarpaulin/18246483/1" onclick="" style="text-decoration: none;"> <!-- one item of list -->
<div class="listItem topBorder clearfix">
<div class="floatLeft">
<div class="listItemSpacer"></div>
</div>
<div class="floatLeft listImage">
<div id="container">
<div class="image-wrapper">
<div class="image-container">
<img class="" src="https://pic.truckscout24.net/images-small/83/64/0018246483001.jpg"/>
</div>
</div>
</div>
<div class="marginTopFontS paddingBottomM">
<span class="fontLegal floatLeft">9 Images</span>
</div>
</div>
<div class="floatLeft listData noUnderline">
<div class="listItemHeader">
<div class="gridSpan7 gridInner floatRight flexibleGridHeaderHeight">
<div class="fontDefault cursorPointer">
<div class="listItemHeaderPrice floatLeft">
<span>€</span>
<span class="fontHeadline">3.5