# Carlink-Scraper

### Setup

In [16]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import numpy as np
import sqlite3 as sql
import pandas as pd
import time
from itertools import islice

### Defining the search criteria

In [17]:
'''
A list of all brands available on AutoScout is created.
'''

# All brand slugs available on AutoScout. The seven high-volume brands come
# first, followed by the remaining brands in alphabetical order.
# Each entry must be followed by a comma: adjacent string literals are silently
# concatenated by Python (a missing comma previously merged 'izh' and 'jaguar'
# into the single bogus entry 'izhjaguar'). 'renault' is listed only once, in
# the leading high-volume group, so it is not scraped twice.
brands = ['audi', 'bmw', 'ford', 'mercedes-benz', 'opel', 'volkswagen', 'renault', '9ff', 'abarth', 'ac', 'acm', 'acura', 'aixam', 'alfa-romeo', 'alpina', 'alpine', 'amphicar', 'ariel-motor', 'artega',
'aspid', 'aston-martin', 'austin', 'autobianchi', 'auverland', 'baic', 'bedford', 'bellier', 'bentley', 'bolloré', 'borgward', 'brilliance', 'bugatti', 'buick', 'byd', 'cadillac', 'caravans-wohnm', 'casalini',
'caterham', 'changhe', 'chatenet', 'chery', 'chevrolet', 'chrysler', 'citroen', 'cityel', 'cmc', 'corvette', 'courb', 'cupra', 'dacia', 'daewoo', 'daf', 'daihatsu', 'daimler', 'dangel', 'de-tomaso',
'derways', 'dfsk', 'dodge', 'donkervoort', 'dr-motor', 'ds-automobiles', 'dutton', 'e.go', 'estrima', 'ferrari', 'fiat', 'fisker', 'gac-gonow', 'galloper', 'gaz', 'geely', 'gem', 'gemballa', 'genesis',
'gillet', 'giotti-victoria', 'gmc', 'goupil', 'great-wall', 'grevac', 'haima', 'hamann', 'haval', 'honda', 'hummer', 'hurtan', 'hyundai', 'infiniti', 'innocenti', 'iso-rivolta', 'isuzu', 'iveco', 'izh',
'jaguar', 'jeep', 'karabag', 'kia', 'koenigsegg', 'ktm', 'lada', 'lamborghini', 'lancia', 'land-rover', 'ldv', 'lexus', 'lifan', 'ligier', 'lincoln', 'lotus', 'mahindra', 'man', 'mansory', 'martin-motors', 'maserati', 'maxus', 'maybach',
'mazda', 'mclaren', 'melex', 'mg', 'microcar', 'minauto', 'mini', 'mitsubishi', 'mitsuoka', 'morgan', 'moskvich', 'mp-lafer', 'mpm-motors', 'nio', 'nissan', 'oldsmobile', 'oldtimer', 'pagani',
'panther-westwinds', 'peugeot', 'pgo', 'piaggio', 'plymouth', 'polestar', 'pontiac', 'proton', 'puch', 'qoros', 'qvale', 'ram', 'regis', 'reliant', 'rolls-royce', 'rover', 'ruf', 'saab',
'santana', 'savel', 'sdg', 'seat', 'shuanghuan', 'skoda', 'smart', 'speedart', 'spyker', 'ssangyong', 'streetscooter', 'subaru', 'suzuki', 'tagaz', 'talbot', 'tasso', 'tata', 'tazzari-ev', 'techart', 'tesla',
'town-life', 'toyota', 'trabant', 'triumph', 'tvr', 'uaz', 'vanderhall', 'vaz', 'vem', 'volvo', 'vortex', 'wallys', 'wartburg', 'westfield', 'wiesmann', 'zastava', 'zaz', 'zhidou', 'zotye', 'others']

'''
A selection of brands is made, as most brands contain very few cars.
For the current selection below, only brands with more than 10k cars for
all geographies and price ranges combined were included.
'''

brandselection = ['audi','bmw']

In [18]:
'''
A list of all AutoScout markets is created.
'''

countrylist = ['A', 'B', 'D', 'E', 'F', 'I', 'L', 'NL']

'''
A selection of countries is made.
For the current scraping task, we select only Germany ('D').
Dictionary:
'A' refers to Austria
'B' refers to Belgium
'D' refers to Germany
'E' refers to Spain
'F' refers to France
'I' refers to Italy
'L' refers to Luxembourg
'NL' refers to the Netherlands
'''
countryselection = ['D']

In [19]:
'''
Select the price range and the bracket size. The smaller the price bracket,
the more search links have to be scraped, but also the smaller the chance that
a given search yields more than the maximum of 400 cars and cars are left out
of the sample. For the Austrian market and the selected brands, a price bracket
size of 50 ensures that all, or very nearly all, cars in the price range are in our sample.
'''
# Search window: lower bound (inclusive), upper bound (exclusive) and the
# width of each price bracket.
fromprice = 80000
toprice = 120000
pricebracket = 50

# Price grid: the lower bound of every bracket, spaced `pricebracket` apart,
# starting at `fromprice` and stopping before `toprice`.
prices = np.arange(
    start=fromprice,
    stop=toprice,
    step=pricebracket,
).tolist()

### Creating the search links for the defined search criteria

In [20]:
class Searchlink_generator:

    '''
    Builds the AutoScout24 search-result URLs in four stages: brand, then
    brand + country, then brand + country + price bracket, and finally one
    URL per results page. The brand, country and price settings are read
    from the module-level names brandselection, countryselection, prices
    and pricebracket.
    '''

    def createBrandlinks(self):

        '''
        Return one base search URL for every brand in brandselection.
        '''
        return ['https://www.autoscout24.de/lst/' + brand for brand in brandselection]

    def createBrandCountrylinks(self, allbrandlinks):

        '''
        Append the fixed query parameters and a country code to every brand
        link. The result is grouped by country: all brand links for the first
        country in countryselection, then all for the next one, and so on.
        '''
        query = '?sort=price&desc=0&ustate=N%2CU&size=20&cy='
        return [
            brandlink + query + country
            for country in countryselection
            for brandlink in allbrandlinks
        ]

    def createBrandCountryPricelinks(self, allbrandcountrylinks):

        '''
        Append a price window to every brand-country link. Each window starts
        at an entry of prices and ends pricebracket - 1 above it. The result
        is grouped by price window.
        '''
        return [
            base + '&pricefrom=' + str(lower) + '&priceto=' + str(lower + pricebracket - 1)
            for lower in prices
            for base in allbrandcountrylinks
        ]

    def createBrandCountryPricePagelinks(self, allbrandcountrypricelinks):

        '''
        Expand every brand-country-price link into 20 links, one per results
        page (page=1 up to and including page=20). The pages of one search
        are adjacent in the returned list.
        '''
        return [
            base + '&page=' + str(page)
            for base in allbrandcountrypricelinks
            for page in range(1, 21)
        ]

In [21]:
# Run the four link-building stages from the inside out:
# brand -> brand+country -> brand+country+price -> brand+country+price+page.
searchlinks = Searchlink_generator()
allbrandcountrypricepagelinks = searchlinks.createBrandCountryPricePagelinks(
    searchlinks.createBrandCountryPricelinks(
        searchlinks.createBrandCountrylinks(
            searchlinks.createBrandlinks()
        )
    )
)
allbrandcountrypricepagelinks

['https://www.autoscout24.de/lst/audi?sort=price&desc=0&ustate=N%2CU&size=20&cy=D&pricefrom=80000&priceto=80049&page=1',
 'https://www.autoscout24.de/lst/audi?sort=price&desc=0&ustate=N%2CU&size=20&cy=D&pricefrom=80000&priceto=80049&page=2',
 'https://www.autoscout24.de/lst/audi?sort=price&desc=0&ustate=N%2CU&size=20&cy=D&pricefrom=80000&priceto=80049&page=3',
 'https://www.autoscout24.de/lst/audi?sort=price&desc=0&ustate=N%2CU&size=20&cy=D&pricefrom=80000&priceto=80049&page=4',
 'https://www.autoscout24.de/lst/audi?sort=price&desc=0&ustate=N%2CU&size=20&cy=D&pricefrom=80000&priceto=80049&page=5',
 'https://www.autoscout24.de/lst/audi?sort=price&desc=0&ustate=N%2CU&size=20&cy=D&pricefrom=80000&priceto=80049&page=6',
 'https://www.autoscout24.de/lst/audi?sort=price&desc=0&ustate=N%2CU&size=20&cy=D&pricefrom=80000&priceto=80049&page=7',
 'https://www.autoscout24.de/lst/audi?sort=price&desc=0&ustate=N%2CU&size=20&cy=D&pricefrom=80000&priceto=80049&page=8',
 'https://www.autoscout24.de/lst

In [22]:
len(allbrandcountrypricepagelinks)

32000

### Saving searchlinks in SQL

In [23]:
# Connect to the local SQLite database file used to store the search links
# and the scraped car links.
database = 'DEdatabase2.db'
connection = sql.connect(database)
# NOTE(review): `cursor` is not used by any of the cells visible here; all
# reads and writes go through pandas with `connection`.
cursor = connection.cursor()

In [24]:
# Store the generated search links as a one-column table in SQLite.
# This rebinds `searchlinks`, which previously held the Searchlink_generator.
# NOTE(review): to_sql is called without `if_exists`, so the pandas default
# applies; re-running this cell after the table exists presumably raises
# instead of overwriting. Confirm that this is intended.
searchlinks = pd.DataFrame(allbrandcountrypricepagelinks)
searchlinks.to_sql('DEsportupto120k', connection)

# Scrape and save carlinks by batches

### Read in search links from database

In [25]:
# Load the stored search links back into a plain Python list.
# Only the second column (position 1) is kept; the first column is presumably
# the DataFrame index that to_sql wrote alongside the links.
query = '''SELECT * from DEsportupto120k'''
searchlinks = pd.read_sql_query(query, connection).iloc[:,1].values.tolist()

### Scrape the searchlinks for carlinks and store batches

In [26]:
class Carlink_Scraper:
    '''

    Carlink_Scraper loops over all the search links in the module-level
    searchlinks list and collects the links to the individual cars found on
    each search result page. The car links are stored in batches in the SQL
    database behind the module-level connection.

    '''

    # Seconds to wait for the server before a request is aborted. Without a
    # timeout a single stalled connection would block the whole scrape.
    REQUEST_TIMEOUT = 30
    # Prefix that turns the relative hrefs found on a result page into full URLs.
    BASE_URL = 'https://www.autoscout24.de'
    # The search links request 20 results per page (size=20) and 20 pages per search.
    RESULTS_PER_PAGE = 20
    PAGES_PER_SEARCH = 20
    # Number of processed search links after which collected car links are written out.
    BATCH_SIZE = 1000
    # NOTE(review): the table name says "upto100k" while the price grid in this
    # file runs up to 120000; the name is kept unchanged so existing data and
    # downstream readers keep working.
    TABLE = 'DEsportlinksupto100k'

    def getSoup(self, link):
        '''

        getSoup downloads the given url and returns it parsed as a BeautifulSoup.
        The response is decoded as UTF-8. The request is aborted with a requests
        exception if the server does not answer within REQUEST_TIMEOUT seconds.

        '''
        r = requests.get(link, timeout=self.REQUEST_TIMEOUT)
        r.encoding = 'UTF-8'
        return BeautifulSoup(r.text, 'lxml')

    def getAllLinks(self, link):
        '''

        getAllLinks returns all the individual car links it can find on a given
        search link, as a list of full URLs. Title blocks without a link (no <a>
        tag or no href) are skipped instead of raising an error.

        '''
        soup = self.getSoup(link)
        carlinks = []
        for title in soup.find_all('div', {'class': 'cldt-summary-titles'}):
            anchor = title.find('a')
            href = anchor.get('href') if anchor is not None else None
            if href:
                carlinks.append(self.BASE_URL + href)
        return carlinks

    def _storeCarlinks(self, carlinks):
        '''

        _storeCarlinks appends the given car links to the SQL table TABLE.
        Nothing is written for an empty list: an empty DataFrame has no link
        column, so writing it first could create the table without that column.

        '''
        if not carlinks:
            return
        pd.DataFrame(carlinks).to_sql(self.TABLE, connection, if_exists='append')

    def carlinkScraper(self):
        '''

        carlinkScraper loops over all searchlinks, looks for individual car links
        and collects them in a list. Each time at least BATCH_SIZE further search
        links have been processed, the list is appended to the SQL table and
        emptied to free up memory; whatever is left is stored at the end.

        For efficiency, the function checks whether the 2nd page of a search
        holds the maximum of 20 cars. If not, the remaining 18 pages of that
        search are skipped and the scrape continues with the next search.

        A tracker counts the search links that were scraped or skipped; it is
        printed at the end together with the elapsed time in seconds.

        '''
        start = time.time()
        allcarlinks = []
        tracker = 0
        # Tracker value at the time of the last write to the database.
        flushed_at = 0
        # An explicit iterator is needed so the skip below can consume links
        # that the for loop would otherwise visit.
        iterator = iter(searchlinks)
        for link in iterator:
            carlinks = self.getAllLinks(link)
            tracker += 1
            allcarlinks.extend(carlinks)
            if link.endswith('&page=2') and len(carlinks) != self.RESULTS_PER_PAGE:
                # Page 2 is not full, so pages 3..20 of this search are skipped.
                # Only links that were really consumed are counted, so the
                # tracker stays correct when the list ends inside a search.
                tracker += sum(1 for _ in islice(iterator, self.PAGES_PER_SEARCH - 2))
            if tracker - flushed_at >= self.BATCH_SIZE:
                self._storeCarlinks(allcarlinks)
                allcarlinks = []
                flushed_at = tracker

        # Store the links collected since the last batch write.
        self._storeCarlinks(allcarlinks)

        print(tracker)
        end = time.time()
        print(end - start)

In [27]:
# Run the scrape over all stored search links. carlinkScraper prints the
# number of search links processed and the elapsed time in seconds.
getallcarlinks = Carlink_Scraper()
getallcarlinks.carlinkScraper()

32000
1630.6742599010468
