# Introduction to web scraping using python

### Part 1 - a very rapid introduction to python

#### Basics - variables, indexing, operations

In [None]:
# Variable assignment: "=" binds a name to a value
x, y = 2, 'hello, world!'

# Show both values, one per line
print(x, y, sep='\n')

##### Numerics

In [None]:
# Two floats and one integer to experiment with
a, b, c = 12.3, 2, 0.01

# type() reports the class of a value: float for a, int for b
print(type(a), type(b), sep='\n')

# Some random operation: ** is applied first, then *, then +
a + c * b**2

##### Strings

In [None]:
# Data types - strings: text wrapped in single or double quotes
my_string = "Hello friends!"
# The type of a string is str
type(my_string)


In [None]:
# String operations: combine two strings with a single space in between
firstname, lastname = 'Lionel', 'Richie'
fullname = f'{firstname} {lastname}'

In [None]:
# Indexing: positions start at 0, a slice stops just before its end position,
# and negative positions count back from the end of the string
print(fullname[0], fullname[:6], fullname[-6:], sep='\n')

##### Booleans

In [None]:
# The two boolean values are written True and False (capitalised)
z, a = True, False

In [None]:
# Booleans behave like integers in comparisons: True == 1 and False == 0,
# so this evaluates to True when a is False
a == 0

In [None]:
# A comparison evaluates to a boolean
10 > 2

#### Data structures

##### Lists

In [None]:
# Lists are written in square brackets; repeated values are allowed
numeric_list = [12, 13, 14, 18, 1, 3, 3]
string_list = ['programming', 'is', 'fun']
# A single list may mix types: here an int, a str, None and a float
mixed_list = [1, '3', None, 2.4]

In [None]:
# List indexing - same as with strings
# A slice that runs past the end of the list is not an error; it stops at the last element
print(numeric_list[3:8])
print(string_list[0])

In [None]:
# Important! A single index gives back the element itself,
# whereas a slice always gives back a list (even a short one)
print(type(mixed_list[1]), type(mixed_list[1:3]), sep='\n')

In [None]:
# Operation on elements obtained through indexing: each index returns the
# element itself, so the two values can be added directly
numeric_list[0] + numeric_list[3]

In [None]:
# This raises a TypeError on purpose: numeric_list[3:4] is a slice, i.e. a length-1 list
# rather than the element inside it, and "+" is not defined between a number and a list
numeric_list[0] + numeric_list[3:4]

In [None]:
# Concatenating lists: "+" builds a NEW list containing the elements of both.
# numeric_list itself is not modified, because the result is not assigned back
numeric_list + ['a', 'b', 'c']

In [None]:
# Append adds exactly one element (in place): here the whole list ['a', 'b', 'c']
# becomes a single nested item at the end of numeric_list
numeric_list.append(['a', 'b', 'c'])

In [None]:
# Extend adds multiple elements (in place): each item of the given list
# is added to the end of numeric_list individually
numeric_list.extend(['x', 'y', 'z'])

##### Dictionaries

In [None]:
# A dictionary maps keys (here: names) to values (here: e-mail addresses)
contact_details = {
    'Thom Yorke': 'thom_yorke@gmail.com',
    'Johnny Greenwood': 'j_g_w@mail.ru',
}

# Look up a value by its key
contact_details['Thom Yorke']

In [None]:
# Adding elements: assigning to a key that does not exist yet inserts it
# (assigning to an existing key would overwrite its value)
contact_details['Ed Obrien'] = 'ed_obrien@compasslexecon.com'
contact_details

#### Control flows

In [None]:
# For loops: run the indented body once for every element of a sequence
integers = [*range(0, 20, 3)]  # every third integer from 0 up to, but not including, 20
for i in integers:
    # Print each number next to its square
    print(i, ':', i * i)

In [None]:
# While loops: repeat the indented body for as long as the condition holds
j = 1
while j < 10:
    print(j / 2)  # "/" always produces a float
    j = j + 1     # move on, otherwise the loop would never end

In [None]:
# If-else: run one branch or the other depending on a condition
threshold = 5
integers_2 = [*range(1, 30, 2)]  # the odd numbers from 1 to 29

if len(integers_2) > threshold:
    print(f'Congrats, your list is longer than {threshold}')
else:
    print('Too bad, your list is too short')
        

### Part 2 - web scraping

In [None]:
# Address of the page we are going to download and scrape
url = 'https://www.truckscout24.com/containers/used'

#### Connecting to the webpage

In [None]:
import requests

In [None]:
# Connect to the server, extract the data
# Without a timeout, requests waits indefinitely if the server never answers;
# give up after 30 seconds instead of hanging the notebook
r = requests.get(url, timeout=30)
# Displaying the response object shows its HTTP status code
r

In [None]:
# Save the content to a variable - r.text is the response body decoded into a string
data = r.text

In [None]:
# Write the content to a text file (so we don't have to keep making requests)
# The "with" block closes the file automatically, even if the write fails part-way.
# data is already a string (it comes from r.text), so it can be written as-is
with open('containers.txt', 'w', encoding='utf8') as f:
    f.write(data)

#### Making soup

In [None]:
# Import BeautifulSoup
from bs4 import BeautifulSoup

In [None]:
# Parse the HTML, create a BeautifulSoup object
# 'html.parser' selects the HTML parser that ships with Python's standard library
soup = BeautifulSoup(data, 'html.parser')
# Uncomment the next line to view the whole document as indented HTML
#soup.prettify()

#### "Drilling down" to the data we want

In [None]:
# Drill-down: find the smallest tag which contains the data we are interested in
# find() returns the first matching tag, or None when nothing matches
product_boxes_frame = soup.find('div', attrs={'class': 'articleList'})

In [None]:
# When a match was found, this is a BeautifulSoup "Tag" object - NB this isn't the
# "actual" text/content; it is an object that we can keep searching within
type(product_boxes_frame)

In [None]:
# Find all product boxes - put into list
# A product box is a <div> carrying all three classes: listItem, topBorder and clearfix.
# A dict cannot express that: repeating the 'class' key keeps only the last value,
# so the search would match every <div> that merely has the class "clearfix".
# In a CSS selector, chained ".class" parts must ALL be present on the element.
product_boxes_list = product_boxes_frame.select('div.listItem.topBorder.clearfix')

#### Extracting the data - Title

In [None]:
# Inspect the first product box; prettify() renders the tag as indented HTML
print(product_boxes_list[0].prettify())

In [None]:
# This is what we want! The product box printed above is the element
# the following cells extract data from
print('Woohoo!')

In [None]:
# Look at the first product box - find the tag holding the title
# (a <span> with the class "fontLoud")
title = product_boxes_list[0].find('span', attrs={'class': 'fontLoud'})

In [None]:
# A Tag object exposes several pieces of information about the element:
print("Name:", title.name)          # the element name
print("Contents:", title.contents)  # the tag's children
print("Attrs:", title.attrs)        # the HTML attributes

In [None]:
# Use the ".text" attribute to extract the text contained in the found tag
title.text

#### Extracting the data - Price

In [None]:
# Find the tag holding the price: first the <div> wrapping the price header,
# then the <span> inside it that carries the amount
price_box = (
    product_boxes_list[0]
    .find('div', attrs={'class': 'listItemHeaderPrice'})
    .find('span', attrs={'class': 'fontHeadline'})
)

print(price_box)

In [None]:
# A bit of cleaning: drop ',-' and remove every '.'
# (presumably thousands separators - confirm against the page)
# The result is still a string, not a number
price = price_box.text.replace(',-', '').replace('.', '')
price

#### Extract data for each entry

In [None]:
# Initiate an empty list to populate with our data
# (the extraction loop appends one [title, price] pair per product box)
output = []

In [None]:
# For each product box, extract information, add to list
for prod in product_boxes_list:

    # find() returns None when a tag is missing, so locate the tags first
    # and only read .text once we know they exist
    title_tag = prod.find('span', attrs={'class': 'fontLoud'})
    price_frame = prod.find('div', attrs={'class': 'listItemHeaderPrice'})
    price_tag = None
    if price_frame is not None:
        price_tag = price_frame.find('span', attrs={'class': 'fontHeadline'})

    # Skip boxes without a title or a price instead of crashing with an AttributeError
    if title_tag is None or price_tag is None:
        continue

    # Title
    title = title_tag.text

    # Price: drop ',-' and remove every '.'; the result is still a string
    price = price_tag.text.replace(',-', '').replace('.', '')
    print(title, price, sep=": ")

    # Anything else you want to add?

    # Append this pair to list
    output.append([title, price])


In [None]:
# Inspect output: a list of [title, price] pairs
output

#### Exporting output

In [None]:
# Quick and dirty: dump the rows straight into a CSV file (no header row)
import csv

# newline='' leaves line endings to the csv module
with open('web_scraped_data.csv', 'w', encoding='utf-8', newline='') as csv_file:
    csv.writer(csv_file).writerows(output)

In [None]:
# More robust: write output to DataFrame
import pandas as pd

# Name the columns in the constructor: this also works when `output` is empty,
# whereas assigning df.columns afterwards raises a length-mismatch ValueError in that case
df = pd.DataFrame(output, columns=['name', 'price'])

# Show the first rows
df.head()

In [None]:
# Save the DataFrame as an Excel workbook (the path is relative to the working directory)
# NOTE(review): pandas needs an Excel writer package such as openpyxl installed - confirm in your environment
df.to_excel('web_scraped_data.xlsx')