<a href="https://colab.research.google.com/github/ricardocarreras/Bootcamp_DataScience_student/blob/master/EQC_scraping_part_III.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import packages
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import re

In [None]:
# Output location
filename = 'EQC'   # label for this search, e.g. the car brand or model you look for
path = '/content/{}.csv'.format(filename)   # file the scraped data is saved to

In [None]:
# URL of the first result page of the search.
# Go to https://www.autoscout24.de and enter your search terms. Select order by age descending.
start_page = 'https://www.autoscout24.de/lst/mercedes-benz/eqc-400?sort=standard&desc=0&ustate=N%2CU&size=20&page=1&cy=D&atype=C&fc=0&qry=EQC&'

# Existing data of a previous search? New results will be appended to it.
# `path` points to a .csv file (the one written with df.to_csv further down), so it
# has to be read with read_csv; read_excel cannot parse a CSV file.
try:
    df = pd.read_csv(path)
except (FileNotFoundError, pd.errors.EmptyDataError):
    # No (usable) file from an earlier run: start with an empty frame.
    df = pd.DataFrame()

In [None]:
# Crawl the search result pages (SRPs) and scrape every offer that is not yet in df.
# Offers whose page cannot be fetched or parsed are reported and skipped.
base_url = 'https://www.autoscout24.de'
n_pages = 100          # number of result pages to visit
request_timeout = 30   # seconds; without a timeout a stalled connection blocks the cell forever

# Split the start URL once around its page parameter so the page number can be swapped in.
url_head, page_marker, url_tail = start_page.partition('page=1')
if not page_marker:
    raise ValueError("start_page must contain 'page=1'")

for i in range(1, n_pages + 1):
    # Read single page
    print('Reading SRP ' + str(i) + '.')
    response = requests.get(url_head + 'page=' + str(i) + url_tail, timeout=request_timeout)
    html = response.text

    doc = BeautifulSoup(html, 'html.parser')

    # Get urls of all results on current page.
    offer_list = []
    for anchor in doc.find_all('a'):
        href = str(anchor.get('href'))  # anchors without href become 'None' and never match
        # Only interested in actual offers (angebote), not in leasing nor recommendation
        if '/angebote/' in href and '/leasing/' not in href and '/recommendation/' not in href:
            offer_list.append(href)
    # The same href can be collected from more than one anchor; keep the first
    # occurrence only (order preserved) so no offer is crawled twice.
    offer_list = list(dict.fromkeys(offer_list))

    # Drop urls that were already crawled. These are in df["url"], if df holds earlier results.
    offer_list_unreduced = offer_list  # Just for checking
    if 'url' in df.columns:
        known_urls = set(df['url'])
        offer_list = [item for item in offer_list if base_url + item not in known_urls]
    else:
        print("First results for this search.")

    # Loop over offers. Rows are collected per page and added to df in one go below.
    page_rows = []
    for item in offer_list:
        url = base_url + item
        try:
            offer_html = requests.get(url, timeout=request_timeout).text
            offer_doc = BeautifulSoup(offer_html, 'html.parser')

            # Empty dictionary for saving car's main features
            car_dict = {}

            # Names of main features are within dt tags of html, their values in dd tags.
            # zip pairs the n-th dt with the n-th dd (assumes one dd per dt — TODO confirm).
            for key, value in zip(offer_doc.find_all('dt'), offer_doc.find_all('dd')):
                car_dict[key.text.replace("\n", "")] = value.text.replace("\n", "")

            # Following features must be identified separately.

            # professional seller?
            car_dict['haendler'] = offer_doc.find(
                "div", attrs={"class": "cldt-vendor-contact-box",
                              "data-vendor-type": "dealer"}) is not None

            # private seller?
            car_dict['privat'] = offer_doc.find(
                "div", attrs={"class": "cldt-vendor-contact-box",
                              "data-vendor-type": "privateseller"}) is not None

            # city of sale incl. zip-code
            car_dict['ort'] = offer_doc.find(
                "div", attrs={"class": "sc-grid-col-12",
                              "data-item-name": "vendor-contact-city"}).text

            # driven miles: value following the "stmil" key embedded in the page source
            car_dict['miles'] = offer_html.split('"stmil" : ')[1].replace("\n", '').split(',')[0].strip()

            # price: keep the digits only
            car_dict['price'] = "".join(
                re.findall(r'[0-9]+', offer_doc.find("div", attrs={"class": "cldt-price"}).text))

            # save url and time of program's execution (one timestamp, so date and time agree)
            now = datetime.now()
            car_dict['url'] = url
            car_dict['date'] = now.strftime("%Y-%m-%d")
            car_dict['time'] = now.strftime("%H-%M-%S")

            # add several features that have no value. These either exist in the current car
            # or not (e.g. air-con, radio, leather seatings, etc.)
            for block in offer_doc.find_all('div', attrs={"class": "cldt-equipment-block sc-grid-col-3 sc-grid-col-m-4 sc-grid-col-s-12 sc-pull-left"}):
                for span in block.find_all('span'):
                    car_dict[span.text] = 1  # assign value of 1 if feature exists.

            page_rows.append(car_dict)

        except Exception as e:
            # Best effort: one offer that fails to load or parse must not stop the crawl.
            print('Skipped ' + url + ': ' + str(e))

    # Append the offers of this page to df. DataFrame.append no longer exists in
    # pandas >= 2.0, so pd.concat is used; dtype=object keeps the scraped values as they are.
    if page_rows:
        page_df = pd.DataFrame(page_rows, dtype=object)
        df = page_df if df.empty else pd.concat([df, page_df], ignore_index=True)

    print('Appended data from SRP ' + str(i) + '.')

Reading SRP 1.
First results for this search.
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
Appended data from SRP 1.
Reading SRP 2.
Appended data from SRP 2.
Reading SRP 3.
Appended data from SRP 3.
Reading SRP 4.
Appended data from SRP 4.
Reading SRP 5.
Appended data from SRP 5.
Reading SRP 6.
Appended data from SRP 6.
Reading SRP 7.
'NoneType' object has no attribute 'text'
Appended data from SRP 7.
Reading SRP 8.
Appended data from SRP 8.
Reading SRP 9.
Appended data from SRP 9.
Reading SRP 10.
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
Appended data from SRP 10.
Reading SRP 11.
Appended data from SRP 11.
Reading SRP 12.
Appended data from SRP 12.
Reading S

In [None]:
# Show the collected offers; large frames are rendered truncated ("..." rows/columns).
df

Unnamed: 0,360° Kamera,ABS,Abstandstempomat,Abstandswarner,Allrad,Alufelgen,Ambientebeleuchtung,Android Auto,Angebotsnummer,Anhängerkupplung,Antriebsart,Anzahl Türen,Apple CarPlay,Armlehne,Außenfarbe,Beheizbares Lenkrad,Beifahrerairbag,Berganfahrassistent,Blendfreies Fernlicht,Bluetooth,Bordcomputer,"CO2-EmissionenWeitere Informationen zum offiziellen Kraftstoffverbrauch und den offiziellen spezifischen CO2-Emissionen neuer Personenkraftwagen können dem ""Leitfaden über den Kraftstoffverbrauch, die CO2-Emissionen und den Stromverbrauch neuer Personenkraftwagen"" entnommen werden, der an allen Verkaufsstellen und bei der Deutschen Automobil Treuhand GmbH unter www.dat.de unentgeltlich erhältlich ist.",DAB-Radio,ESP,Einparkhilfe Kamera,Einparkhilfe Sensoren hinten,Einparkhilfe Sensoren vorne,Einparkhilfe selbstlenkendes System,Elektr. Fensterheber,Elektrische Heckklappe,Elektrische Seitenspiegel,Elektrische Sitze,Erstzulassung,Fahrerairbag,Fahrzeughalter,Farbe laut Hersteller,Feinstaubplakette,Fernlichtassistent,Freisprecheinrichtung,Garantie,...,Nebelscheinwerfer,Sportfahrwerk,Sportpaket,Sportsitze,Gänge,TV,Kopfairbag,Zentralverriegelung mit Funkfernbedienung,Xenonscheinwerfer,Bi-Xenon Scheinwerfer,2-Zonen-Klimaautomatik,Einparkhilfe,Standheizung,3-Zonen-Klimaautomatik,Induktionsladen für Smartphones,Taxi oder Mietwagen,Winterpaket,Dachreling,E10-geeignet,Winterreifen,Nachtsicht-Assistent,Umklappbarer Beifahrersitz,Windschott(für Cabrio),Zylinder,Stahlfelgen,4-Zonen-Klimaautomatik,Airbag hinten,Massagesitze,Laserlicht,Notrad,Scheinwerferreinigung,Reichweitenverlängerer,Elektronische Parkbremse,CD,CO2-Emissionen,Beheizbare Frontscheibe,Luftfederung,Panoramadach,Verfügbarkeit,"Elektr. Sitzeinstellung, hinten"
URL,1,1,1,1,1,1,1,1,710966,1,Allrad,5,1,1,Silber,1,1,1,1,1,1,0 g/km (komb),1,1,1,1,1,1,1,1,1,1,2021,1,1,hightechsilber,4 (Grün),1,1,24 Monate,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
URL,,1,,1,1,1,1,,7164,1,Allrad,5,,1,Grau,,1,1,1,1,1,0 g/km (komb),1,1,1,1,1,1,1,1,1,1,2020,1,1,graphitgrau,4 (Grün),1,1,24 Monate,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
URL,1,1,1,1,1,1,1,1,1P007657,1,Allrad,5,1,1,Schwarz,1,,,,1,1,0 g/km (komb),1,1,1,1,1,,1,1,1,1,2021,,1,obsidianschwarz metallic,4 (Grün),1,1,12 Monate,...,1,1,1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
URL,1,1,1,1,1,1,1,1,K31164,1,Allrad,5,1,1,Silber,,1,1,1,1,1,0 g/km (komb),1,1,1,1,1,1,1,1,1,1,2019,1,1,hightechsilber,4 (Grün),1,1,0 Monate,...,,,,,1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
URL,,1,1,,1,1,,,206459,,Allrad,5,,1,Schwarz,1,1,1,,1,1,0 g/km (komb),1,1,,1,1,1,1,1,1,1,2021,1,1,schwarz,4 (Grün),,1,,...,,1,1,1,,,1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
URL,1,1,1,1,1,1,,1,,1,Allrad,4,1,1,Schwarz,,1,1,1,1,1,0 g/km (komb),1,1,1,1,1,1,1,1,1,1,,1,,197 obsidianschwarzmetallic,4 (Grün),,1,,...,,,1,,,,,,,,1,,1,,,,,,,,,,,,,,,,,,,,,,,,1,,In 5 Tagen ab Bestellung,
URL,1,1,,1,1,1,1,1,20G0869,,Allrad,4,1,1,Schwarz,,,,1,1,1,0 g/km (komb),,1,1,1,1,1,1,1,1,1,,,,Schwarz Uni,,1,1,,...,1,1,,1,,,,,,,,,,,1,,,,,,,,,,,,,,,,1,,,1,,,,,In 60 Tagen ab Bestellung,
URL,1,1,,1,1,1,1,1,21G0642,,Allrad,4,1,1,Grau,,,,1,1,1,0 g/km (komb),,1,1,1,1,1,1,1,1,1,,,,Designo Selenitgrau Magno,,1,1,,...,1,1,,1,,,,,,,,,,,1,,,,,,,,,,,,,,,,1,,,1,,,,,,
URL,,1,1,,,,,,128,,,4,,,Silber,,1,,,,,0 g/km (komb),,1,,,,,1,,,1,2020,1,,HIGH-TECH-SILBER METALLIC,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
# Persist the collected offers as a comma-separated file (row index is not written)
df.to_csv(path, sep=",", index=False)

In [None]:
# Additionally store the data as Parquet in its own file next to the CSV.
# Writing Parquet to `path` itself would overwrite the CSV with binary data,
# which the next run could no longer read back.
parquet_path = path.rsplit('.', 1)[0] + '.parquet'
df.to_parquet(parquet_path)

In [None]:
# Dimensions of the collected data as a (rows, columns) tuple
df.shape

(153, 173)