In [6]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

# Brand ranking scraping

We start by scraping the brand ranking from the following URL: http://en.classora.com/reports/v46699/ranking-of-the-worlds-most-popular-car-brands

In [4]:
def brands_rank_scraper():
    """
    Scrape the Classora ranking of the most popular car brands and save it as a CSV file.

    The ranking is requested in two chunks (``groupCount=50`` starting at entries 1 and 51).
    For each chunk, the text of every ``rankingEntryCell`` (lower-cased brand name) is paired
    with the text of the ``rankingDataCell`` at the same position (its score). All pairs are
    written to ``../data/brands_rank.csv`` using ``;`` as separator, with the columns
    ``brand`` and ``score``.

    Raises:
        requests.HTTPError: if a ranking page answers with an HTTP error status.
        requests.Timeout: if a ranking page does not answer within the timeout.
        ValueError: if a page contains fewer score cells than brand cells, which means the
            page layout is not the expected one.
    """

    # Seconds to wait for the server; without a timeout a stalled connection hangs forever.
    request_timeout = 30

    # The two pages of the ranking (50 entries each)
    rank_urls = [
        "http://en.classora.com/reports/v46699/ranking-of-the-worlds-most-popular-car-brands?id=466&groupCount=50&startIndex=1",
        "http://en.classora.com/reports/v46699/ranking-of-the-worlds-most-popular-car-brands?id=466&groupCount=50&startIndex=51",
    ]
    final_rank_list = []

    # For each url, we take the brands and their scores and store them in the final list
    for url in rank_urls:
        rank_req = requests.get(url, timeout=request_timeout)
        # Fail loudly on an error page instead of silently writing an incomplete ranking
        rank_req.raise_for_status()
        cars_soup = BeautifulSoup(rank_req.text, "html.parser")

        brand_cells = cars_soup.find_all("td", {"class": "rankingEntryCell"})
        points_cells = cars_soup.find_all("td", {"class": "rankingDataCell"})

        # zip() would silently drop brands that have no matching score, so check first
        if len(points_cells) < len(brand_cells):
            raise ValueError(
                "Unexpected ranking layout in %s: %d brand cells but only %d score cells"
                % (url, len(brand_cells), len(points_cells))
            )

        final_rank_list.extend(
            [brand_cell.get_text().lower(), points_cell.get_text()]
            for brand_cell, points_cell in zip(brand_cells, points_cells)
        )

    # Finally, the information is stored in a csv file
    rank_df = pd.DataFrame(final_rank_list, columns=['brand', 'score'])
    rank_df.to_csv("../data/brands_rank.csv", sep = ';', index = False)

    # We print a message telling the user that the process has finished
    print("The process has finished successfully")

In [5]:
# Run the brand-ranking scrape: downloads the ranking pages and writes ../data/brands_rank.csv
brands_rank_scraper()

The process has finished successfully


# Cars scraping

Finally, we scrape the car advertisements from the following website: https://www.coches.com/coches-segunda-mano/coches-ocasion.htm

In [9]:
# Seconds to wait for each HTTP response; without a timeout a stalled connection hangs forever.
_REQUEST_TIMEOUT = 30

# Column order of the output CSV; every scraped ad becomes one row with exactly these fields.
_CARS_HEADER = ['Title', 'Brand', 'Province', 'Price', 'Year', 'Kms', 'Fuel', 'Type of Gears', 'Seller', 'Guarantee',
                'Colour', 'Boot Capacity', 'Length', 'Height', 'Width', 'Doors', 'Vacancies', 'Tank', 'Weight',
                'Max Weight', 'Type', 'Max Speed', 'Comb Fuel', 'Urban Use', 'Extraurban Use', 'Aceleration',
                'Autonomy', 'CO2 Emissions', 'Output', 'Cubic Capacity', 'Cylinders', 'Max Par', 'Gears', 'Transmission',
                'Traction', 'Url']

# Label text found in the ad page -> output column. There is one table per page section, so a
# label is only honoured inside the section it is expected in.

## General information
_GENERAL_LABELS = {
    "Precio": "Price",
    "Año": "Year",
    "Kilómetros": "Kms",
    "Combustible": "Fuel",
    "Cambio": "Type of Gears",
    "Vendedor": "Seller",
    "Garantía": "Guarantee",
    "Color exterior": "Colour",
}

## Technical sheet: both groups below are read from the same 'div' class, so they share a table
_SPECS_LABELS = {
    ### Measurements and weights
    "Capacidad maletero": "Boot Capacity",
    "Longitud": "Length",
    "Altura": "Height",
    "Anchura": "Width",
    "Núm. puertas": "Doors",
    "Núm. plazas": "Vacancies",
    "Capacidad depósito": "Tank",
    "Peso": "Weight",
    "Peso máx autorizado": "Max Weight",
    "Carrocería": "Type",
    ### Engine and transmission
    "Potencia": "Output",
    "Cilindrada": "Cubic Capacity",
    "Número cilindros": "Cylinders",
    "Par máximo": "Max Par",
    "Núm. marchas": "Gears",
    "Transmisión": "Transmission",
    "Tracción": "Traction",
}

## Use and features
_PERFORMANCE_LABELS = {
    "Velocidad máxima": "Max Speed",
    "Consumo combinado": "Comb Fuel",
    "Consumo urbano": "Urban Use",
    "Consumo extraurbano": "Extraurban Use",
    "Aceleración 0-100": "Aceleration",
    "Autonomía": "Autonomy",
    "Emisión co2": "CO2 Emissions",
}


def _count_pages(total_ads, ads_per_page):
    """Return the number of listing pages needed to show total_ads, rounding partial pages up."""
    return (total_ads + ads_per_page - 1) // ads_per_page


def _read_labelled_values(sections, labels, record):
    """Copy into record the value of every 'cc_car_data' cell in sections whose label is in labels."""
    for section in sections:
        for cell in section.find_all("div", {"class": "cc_car_data"}):
            label_tag = cell.find("small")
            value_tag = cell.find("strong")
            # A cell without both tags carries no usable label/value pair
            if label_tag is None or value_tag is None:
                continue
            column = labels.get(label_tag.get_text())
            if column is not None:
                record[column] = value_tag.get_text()


def _scrape_ad(link):
    """Download one ad page and return its fields as a new dict keyed by _CARS_HEADER (missing fields are "")."""
    # A fresh record per ad: a field missing from this ad must stay empty instead of
    # inheriting the value scraped from the previous ad.
    record = dict.fromkeys(_CARS_HEADER, "")
    record['Url'] = link

    link_req = requests.get(link, timeout=_REQUEST_TIMEOUT)
    link_soup = BeautifulSoup(link_req.text, "html.parser")

    ## Title and brand (the brand is the first word of the title)
    breadcrumb = link_soup.find("ol", {"class": "breadcrumb"})
    if breadcrumb is not None:
        title_tag = breadcrumb.find("span")
        if title_tag is not None:
            record['Title'] = title_tag.get_text()
            record['Brand'] = record['Title'].split(" ")[0]

    ## Province: second 'small' tag inside the price heading
    price_heading = link_soup.find("h1", {"class": "cc_model_price"})
    if price_heading is not None:
        heading_notes = price_heading.find_all("small")
        if len(heading_notes) > 1:
            record['Province'] = heading_notes[1].get_text()

    ## 'div' where the general information is located
    div_car_info = link_soup.find_all("div", {"class": "col-lg-12 hidden-md hidden-sm hidden-xs"})
    _read_labelled_values(div_car_info, _GENERAL_LABELS, record)

    ## 'div' where the measures and the engine information are located
    div_specs_info = link_soup.find_all("div", {"class": "col-lg-12 col-md-12 col-sm-12 col-xs-12 hidden-xs hidden-sm"})
    _read_labelled_values(div_specs_info, _SPECS_LABELS, record)

    ## 'div' where the performance information is located
    div_performance = link_soup.find_all("div", {"class": "row hidd"})
    _read_labelled_values(div_performance, _PERFORMANCE_LABELS, record)

    return record


def cars_scraper():

    """
    This function creates a csv file which contains all the information we want of the cars in the catalog.

    The first listing page is used to read the total number of ads and the number of ads per
    page, from which the number of listing pages is derived (rounding up). Every listing page
    that answers with status 200 is then visited, each ad linked from it is downloaded, and one
    row per ad is written to ``../data/cars_data.csv`` (``;``-separated, UTF-8). Fields that an
    ad does not show are left as empty strings.

    Listing pages are requested as ``?page=1`` up to ``?page=<number of pages>``; this assumes
    the site numbers its pages from 1.

    Raises:
        requests.HTTPError: if the first listing page answers with an HTTP error status.
        requests.Timeout: if any request does not answer within the timeout.
        ValueError: if no ads are found in the first listing page.
    """

    #==========================================================================
    # Obtention of the advertisements url
    #==========================================================================

    # webpage url
    url = "https://www.coches.com/coches-segunda-mano/coches-ocasion.htm"

    # Obtention of the data
    req = requests.get(url, timeout=_REQUEST_TIMEOUT)
    req.raise_for_status()
    cars_soup = BeautifulSoup(req.text, "html.parser")

    # We obtain the number of ads in the catalog ('.' is removed before the integer conversion)
    total_number_ads_block = str(cars_soup.find("div", {"class": "informacion"})
                                 .find("strong").contents[0])
    total_number_ads = int(total_number_ads_block.replace('.', ''))

    # Now we find the number of pages
    ads_per_page = len(cars_soup.find_all("div", {"class": "oferta"}))
    if ads_per_page == 0:
        raise ValueError("No ads were found in %s, the number of pages cannot be computed" % url)
    # Rounded up so that a partially filled last page is counted too
    number_of_pages = _count_pages(total_number_ads, ads_per_page)

    #==========================================================================
    # Creation of the csv file
    #==========================================================================

    body = []

    # Creation of the table using the information of each page (last page included)
    for page in range(1, number_of_pages + 1):
        url_with_page = 'https://www.coches.com/coches-segunda-mano/coches-ocasion.htm?page=%d' % page
        sub_req = requests.get(url_with_page, allow_redirects = False, timeout = _REQUEST_TIMEOUT)

        # If the status code is 200, the resource has been found; any other page is skipped
        if sub_req.status_code != 200:
            continue

        info_soup = BeautifulSoup(sub_req.text, "html.parser")

        # Obtention of the url of each car
        for offer in info_soup.find_all("div", {"class": "oferta"}):
            anchor = offer.find("a", href=True)
            # An offer block without a link cannot be followed
            if anchor is None:
                continue

            record = _scrape_ad(anchor["href"])

            ## We append all the features to the body, in header order
            body.append([record[column] for column in _CARS_HEADER])

    ## CSV file building
    cars_df = pd.DataFrame(body, columns=_CARS_HEADER)
    cars_df.to_csv('../data/cars_data.csv', sep = ';', index = False, encoding = 'utf-8')

    # We finish the process printing a message to the user
    print("The process has finished successfully")
    print("%d ads has been stored in the dataset" % len(cars_df))

In [None]:
# Run the car-ads scrape: one HTTP request per listing page plus one per ad, so expect a long
# run; the result is written to ../data/cars_data.csv
cars_scraper()